# Importing the required libraries for data manipulation, plotting, and modelling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from scipy.stats import ttest_ind, chi2_contingency, f_oneway
import streamlit as st
# Enables interactive Plotly charts inside Jupyter Notebook
import plotly.io as pio
pio.renderers.default = 'notebook_connected'
import plotly.io as pio
pio.renderers.default = 'notebook'
# Ensures that matplotlib plots display directly below the code cell in Jupyter Notebook
%matplotlib inline
import pandas as pd
# Loading the dataset
df = pd.read_csv("heart.csv")
# Checking for missing values
print(df.isnull().sum())
Age 0 Sex 0 ChestPainType 0 RestingBP 0 Cholesterol 0 FastingBS 0 RestingECG 0 MaxHR 0 ExerciseAngina 0 Oldpeak 0 ST_Slope 0 HeartDisease 0 dtype: int64
As this dataset has no missing values, we are proceeding with further cleaning processes such as (One-Hot Coding and Standard Scaler).
# Prints all the columns in the dataset
print(df.columns)
Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
'HeartDisease'],
dtype='object')
import pandas as pd
# Loading the dataset
df = pd.read_csv("heart.csv")
df.columns = df.columns.str.strip().str.lower() #removing the whitespaces and converting the column names to lowercase
# Identifying the categorical columns from the dataset
categorical_cols = df.select_dtypes(include='object').columns.tolist()
# Applying one-hot encoding using pandas
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=False)
# Displaying the output
print("Original shape:", df.shape)
print("Encoded shape:", df_encoded.shape)
df_encoded.head()
Original shape: (918, 12) Encoded shape: (918, 21)
| age | restingbp | cholesterol | fastingbs | maxhr | oldpeak | heartdisease | sex_F | sex_M | chestpaintype_ASY | ... | chestpaintype_NAP | chestpaintype_TA | restingecg_LVH | restingecg_Normal | restingecg_ST | exerciseangina_N | exerciseangina_Y | st_slope_Down | st_slope_Flat | st_slope_Up | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | 140 | 289 | 0 | 172 | 0.0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
| 1 | 49 | 160 | 180 | 0 | 156 | 1.0 | 1 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| 2 | 37 | 130 | 283 | 0 | 98 | 0.0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 3 | 48 | 138 | 214 | 0 | 108 | 1.5 | 1 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 4 | 54 | 150 | 195 | 0 | 122 | 0.0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
5 rows × 21 columns
Interpretation: This dataset has certain categorical features such as chestpain_type, sex of the patient, restingecg, slope_ST. And we cannot build machine learning models such as logistic regression, random Forest and XGBoost with these categorical features in it. Therefore, we apply One-Hot Coding to convert all the categorical features into numerical features. Thus making it easier to train our models and keep our categories equally weighted and unordered.
print(df_encoded.dtypes)
age int64 restingbp int64 cholesterol int64 fastingbs int64 maxhr int64 oldpeak float64 heartdisease int64 sex_F uint8 sex_M uint8 chestpaintype_ASY uint8 chestpaintype_ATA uint8 chestpaintype_NAP uint8 chestpaintype_TA uint8 restingecg_LVH uint8 restingecg_Normal uint8 restingecg_ST uint8 exerciseangina_N uint8 exerciseangina_Y uint8 st_slope_Down uint8 st_slope_Flat uint8 st_slope_Up uint8 dtype: object
Now, we can see that all the categorical features have been converted to numerical features.
#importing the required libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
#Loading the dataset
df = pd.read_csv("heart.csv")
#Identifying the features to scale (excluding the target column)
features_to_scale = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
target_column = 'HeartDisease'
#Initialization of scaler
scaler = StandardScaler()
#Fitting and transforming the numerical features
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])
#Displaying the scaled data
print(df.head())
Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG \
0 -1.433140 M ATA 0.410909 0.825070 0 Normal
1 -0.478484 F NAP 1.491752 -0.171961 0 Normal
2 -1.751359 M ATA -0.129513 0.770188 0 ST
3 -0.584556 F ASY 0.302825 0.139040 0 Normal
4 0.051881 M NAP 0.951331 -0.034755 0 Normal
MaxHR ExerciseAngina Oldpeak ST_Slope HeartDisease
0 1.382928 N -0.832432 Up 0
1 0.754157 N 0.105664 Flat 1
2 -1.525138 N -0.832432 Up 0
3 -1.132156 Y 0.574711 Flat 1
4 -0.581981 N -0.832432 Up 0
This above code loads the dataset and applies standard scaling to selected numerical features (Age, RestingBP, Cholesterol, MaxHR, and Oldpeak). Using StandardScaler, we transforms each feature to have mean = 0 and standard deviation = 1, which helps improve the performance of many machine learning models.
# Loading the dataset
df = pd.read_csv('heart.csv')
# Show the first five rows
print(df.head())
Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR \ 0 40 M ATA 140 289 0 Normal 172 1 49 F NAP 160 180 0 Normal 156 2 37 M ATA 130 283 0 ST 98 3 48 F ASY 138 214 0 Normal 108 4 54 M NAP 150 195 0 Normal 122 ExerciseAngina Oldpeak ST_Slope HeartDisease 0 N 0.0 Up 0 1 N 1.0 Flat 1 2 N 0.0 Up 0 3 Y 1.5 Flat 1 4 N 0.0 Up 0
import pandas as pd
# Loading the dataset
df = pd.read_csv('heart.csv')
# standardize the column names to lowercase
df.columns = df.columns.str.strip().str.lower()
# Shows total number of patient records
print(f"Total patient records: {len(df)}")
# Counts how many patients have heart disease (value == 1)
count_hd = (df['heartdisease'] == 1).sum()
print(f"Patients with heart disease: {count_hd}")
# Counts how many patients do not have heart disease (value == 0)
no_hd_count = (df['heartdisease'] == 0).sum()
print(f"Patients with no heart disease: {no_hd_count}")
Total patient records: 918 Patients with heart disease: 508 Patients with no heart disease: 410
# Summary statistics
print(df.describe())
age restingbp cholesterol fastingbs maxhr \
count 918.000000 918.000000 918.000000 918.000000 918.000000
mean 53.510893 132.396514 198.799564 0.233115 136.809368
std 9.432617 18.514154 109.384145 0.423046 25.460334
min 28.000000 0.000000 0.000000 0.000000 60.000000
25% 47.000000 120.000000 173.250000 0.000000 120.000000
50% 54.000000 130.000000 223.000000 0.000000 138.000000
75% 60.000000 140.000000 267.000000 0.000000 156.000000
max 77.000000 200.000000 603.000000 1.000000 202.000000
oldpeak heartdisease
count 918.000000 918.000000
mean 0.887364 0.553377
std 1.066570 0.497414
min -2.600000 0.000000
25% 0.000000 0.000000
50% 0.600000 1.000000
75% 1.500000 1.000000
max 6.200000 1.000000
Explanation of each row name:
| Row Name | Description |
|---|---|
| count | Number of non-missing (non-NaN) entries in the column. |
| mean | The average (arithmetic mean) value of the column. |
| std | Standard deviation — how much the values vary from the mean. |
| min | Minimum value in the column. |
| 25% | 1st quartile (Q1) — 25% of values are below this. |
| 50% | Median (Q2) — 50% of values are below this. |
| 75% | 3rd quartile (Q3) — 75% of values are below this. |
| max | Maximum value in the column. |
# importing the required libraries
import plotly.graph_objects as go
import plotly.io as pio
# Set renderer to show plots in notebook
pio.renderers.default = 'notebook'
# Location coordinates: [latitude, longitude]
locations = {
'Cleveland Clinic, USA': [41.4993, -81.6944],
'VA Medical Center, USA': [33.7817, -118.1880],
'Budapest, Hungary': [47.4979, 19.0402],
'Zurich, Switzerland': [47.3769, 8.5417]
}
# Preparing the coordinates
lats = [loc[0] for loc in locations.values()]
lons = [loc[1] for loc in locations.values()]
names = list(locations.keys())
# Markers
flags = go.Scattergeo(
lon=lons,
lat=lats,
text=names,
mode='markers+text',
marker=dict(size=10, color='red'),
textposition='top center'
)
# marking the lines between each pair of locations
lines = []
loc_values = list(locations.values())
for i in range(len(loc_values)):
for j in range(i + 1, len(loc_values)):
lines.append(
go.Scattergeo(
lon=[loc_values[i][1], loc_values[j][1]],
lat=[loc_values[i][0], loc_values[j][0]],
mode='lines',
line=dict(width=1, color='blue'),
showlegend=False
)
)
# Creating the figure
fig = go.Figure(data=[flags] + lines)
# drawing the layout for globe
fig.update_layout(
title='Visualization of the places of the aquisation of the Heart Dataset',
showlegend=False,
geo=dict(
projection_type='orthographic',
showland=True,
landcolor='lightgray',
oceancolor='lightblue',
showocean=True,
countrycolor='white',
showcoastlines=True,
coastlinecolor='gray'
)
)
# displaying the globe
fig.show()
# importing the required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Loading and cleaning the dataset
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower()
# Defining the (quantitative) variables
continuous_vars = ['age', 'restingbp', 'cholesterol', 'maxhr', 'oldpeak']
# Plotting boxplots
plt.figure(figsize=(15, 8))
for i, var in enumerate(continuous_vars):
plt.subplot(2, 3, i + 1)
sns.boxplot(x=df[var], color='skyblue')
plt.title(f'Boxplot of {var.capitalize()}')
# displaying the output
plt.tight_layout()
plt.show()
# importing the required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Loading and cleaning the dataset
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower()
# Defining the categorical variables
categorical_vars = ['sex', 'chestpaintype', 'restingecg', 'exerciseangina', 'st_slope']
# Plotting countplots
plt.figure(figsize=(15, 10))
for i, var in enumerate(categorical_vars):
plt.subplot(2, 3, i + 1)
sns.countplot(data=df, x=var, palette='Set2')
plt.title(f'Count Plot of {var.capitalize()}')
plt.xticks(rotation=30)
# displaying the output
plt.tight_layout()
plt.show()
Interpretation of output:
Univariate analysis focuses on analyzing one variable at a time, helping us understand its distribution, central tendency, variability, and potential issues (like outliers or skewness).
Numerical Variables (e.g., age, cholesterol, restingbp, oldpeak, maxhr) Boxplots shows:
Categorical Variables (e.g., sex, chestpaintype, restingecg, exerciseangina, heartdisease) Count plots shows:
# defining the Numerical features
numerical_vars = ['age', 'cholesterol', 'restingbp', 'maxhr', 'oldpeak']
# plotting the figure
plt.figure(figsize=(15, 10))
for i, var in enumerate(numerical_vars):
plt.subplot(2, 3, i + 1)
sns.violinplot(data=df, x='heartdisease', y=var, palette='Pastel1')
plt.title(f'{var.capitalize()} by Heart Disease')
plt.xlabel('Heart Disease')
plt.ylabel(var.capitalize())
# displaying the output
plt.tight_layout()
plt.show()
Interpretation of output:
The above violin plots show the distribution and density of numerical features (like age, cholesterol, etc.) across the two target classes:
0 = No heart disease 1 = Has heart disease
These insights help us to understand how each feature relates to the likelihood of heart disease.
1. Age vs. Heart Disease
People with heart disease (1) tend to be older on average. Violin plot for class 1 is often shifted to higher ages.
2. Cholesterol vs. Heart Disease
The distribution can be wide and overlapping, meaning cholesterol alone is not a strong separator. Possible outliers may appear as long tails or thick bulges.
3. RestingBP vs. Heart Disease
Similar to cholesterol, may not have a clear separation. Still useful in combination with other variables (e.g., oldpeak or age).
4. MaxHR vs. Heart Disease
Lower MaxHR is often seen in heart disease patients. You may see the violin for class 1 more concentrated at lower heart rates.
5. Oldpeak vs. Heart Disease
This is usually a strong indicator. People with heart disease (1) often have higher oldpeak values (seen as wider top in class 1).
# KDE plot
# plotting the figure
plt.figure(figsize=(15, 10))
for i, var in enumerate(numerical_vars):
plt.subplot(2, 3, i + 1)
sns.kdeplot(data=df, x=var, hue='heartdisease', fill=True, common_norm=False, palette='Set1', alpha=0.4)
plt.title(f'KDE: {var.capitalize()} by Heart Disease')
plt.xlabel(var.capitalize())
plt.ylabel('Density')
# displaying the output
plt.tight_layout()
plt.show()
Interpretation of output:
The KDE plot code is performing bivariate analysis using Kernel Density Estimation (KDE) to visualize the distribution of each numerical variable across the two classes of heartdisease.
This KDE plot does:
Finding insights from the output: 1. Shape & Overlap If the KDE curves for heart disease = 0 and 1 overlap heavily, the variable may not be a strong discriminator. Less overlap means the variable helps distinguish between the two groups.
2. Peak Shifts If one group has a peak at a higher or lower value, it indicates a trend: Example: oldpeak peaks at higher values for heart disease patients. maxhr peaks at higher values for healthy individuals.
| Feature | Distribution Insight | Predictive Strength |
|---|---|---|
Age |
Older patients more likely have heart disease | Moderate |
Cholesterol |
Broad overlap; not a strong separator | Low |
RestingBP |
Overlapping curves; low separation | Low |
MaxHR |
Lower max HR seen in heart disease patients | Moderate to Strong |
Oldpeak |
Higher oldpeak values linked to heart disease | Strong |
# Bar chart: Age vs Heart-disease
# Loading the dataset
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower() # removal of unwanted whitespaces and setting the column names to lower case
# Group by age and heart disease status
age_hd_counts = df.groupby(['age', 'heartdisease']).size().unstack(fill_value=0)
# Plotting the graph
age_hd_counts.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='coolwarm') # <- reduced size
plt.title('Bar chart: Heart Disease Frequency by Age')
plt.xlabel('Age')
plt.ylabel('Number of People')
plt.legend(['No Heart Disease', 'Heart Disease'])
plt.tight_layout()
plt.show()
Interpretation:
Bars are stacked with:
Insights drawn:
1. Age Groups with Highest Cases:
Some age groups (e.g., 52, 54, 58, 60) may have taller bars, meaning more people in that age range are in the dataset. We can identify peak heart disease ages if the upper segment (disease = 1) dominates.
2. Heart Disease Tends to Increase with Age:
In many datasets, you’ll notice that older age groups (50–65+) have more red or dark-colored segments → indicating more cases of heart disease. Younger ages (e.g., 30s, 40s) have smaller bars and often more blue → less likely to have heart disease.
3. Comparing Within Age Groups:
Within any specific age, if the red segment (disease = 1) is taller than blue, that age has more heart disease patients. Some mid-age groups might have balanced bars → indicating mixed risk.
# Scatter plot: oldpeak vs heart-disease
plt.figure(figsize=(10, 5))
sns.stripplot(data=df, x='heartdisease', y='oldpeak', jitter=True, palette='Set2')
plt.title('Scatter plot: Oldpeak vs Heart Disease')
plt.xlabel('Heart Disease (0 = No, 1 = Yes)')
plt.ylabel('Oldpeak (ST Depression after Exercise)')
plt.grid(True)
plt.show()
Interpretation of output:
Plot details:
Points: Each point is a patient, and their vertical position shows their oldpeak value.
jitter=True: Adds horizontal spread to prevent points from overlapping.
Color: Different hues (from Set2) for visual distinction.
Insights drawn:
1. Patients Without Heart Disease (x = 0):
2. Patients With Heart Disease (x = 1):
# Violin plot: age vs chestpain
# Loading the dataset and fixing the columns
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower()
# defining names of chest pain
chest_pain_map = {
'TA': 'Typical Angina',
'ATA': 'Atypical Angina',
'NAP': 'Non-Anginal Pain',
'ASY': 'Asymptomatic'
}
df['chestpaintype'] = df['chestpaintype'].map(chest_pain_map)
# Dropping rows with missing age or chest paintype
df = df.dropna(subset=['chestpaintype', 'age'])
# displaying the Violin plot
plt.figure(figsize=(8, 6))
sns.violinplot(data=df, x='chestpaintype', y='age', palette='Set2')
plt.title('Age Distribution per Chest Pain Type')
plt.xlabel('Chest Pain Type')
plt.ylabel('Age')
plt.tight_layout()
plt.show()
Interpretation of output:
This violin plot visualizes how age is distributed across different types of chest pain, helping identify patterns in age-related chest pain presentations among heart patients.
Plot details:
x-axis: chestpaintype (Mapped categories: Typical Angina, Atypical Angina, Non-Anginal Pain, Asymptomatic)
y-axis: age of the patients
Violin Shape: Shows distribution, density, and spread of age for each chest pain category.
Insights drawn:
| Chest Pain Type | Age Distribution Insight |
|---|---|
| Typical Angina | More common in older adults |
| Atypical Angina | Occurs in mid-age groups |
| Non-Anginal Pain | Wide range of ages affected |
| Asymptomatic | Seen mostly in older adults, may indicate silent risk |
# Beeswarm plot: Cholestrol vs Heart-disease
# Loading the dataset
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower() # removal of unwanted whitespaces
# Ensuring correct data types
df['heartdisease'] = pd.to_numeric(df['heartdisease'], errors='coerce')
df['cholesterol'] = pd.to_numeric(df['cholesterol'], errors='coerce')
# Dropping missing values
df = df.dropna(subset=['cholesterol', 'heartdisease'])
# Plotting the graph
plt.figure(figsize=(8, 6))
sns.swarmplot(data=df, x='heartdisease', y='cholesterol', palette='Set2')
plt.title('Beeswarm plot: Cholesterol Levels vs Heart Disease')
plt.xlabel('Heart Disease (0 = No, 1 = Yes)')
plt.ylabel('Cholesterol (mg/dL)')
plt.tight_layout()
plt.show()
C:\Users\Sharon Karabel\anaconda3\lib\site-packages\seaborn\categorical.py:1296: UserWarning: 23.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
Interpretation of output:
The beeswarm plot (a variation of a scatter plot with jittering) visualizes how cholesterol levels are distributed among patients with and without heart disease.
Plot details:
x-axis: heartdisease
y-axis: cholesterol (in mg/dL)
Each dot: A single patient’s cholesterol level. Points are spread out horizontally to avoid overlap, giving a "beeswarm" look.
Insights drawn:
| Observation | Interpretation |
|---|---|
Wide overlap between class 0 and 1 |
Cholesterol is not a standalone discriminator |
| High values in both classes | Some patients have high cholesterol but no disease |
| Spread is more vertical than grouped | Cholesterol varies widely within both groups |
# Heatmap for Heart-disease
# Loading the dataset
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower() # removal of unwanted whitespaces
# Selection of numeric columns
numeric_df = df.select_dtypes(include='number')
# Computation of the correlation matrix
corr_matrix = numeric_df.corr()
# Plotting the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title("Correlation Matrix - Heart Disease Dataset", fontsize=14)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
Interpretation of output:
This heatmap visualizes the correlation coefficients between all numeric variables in the dataset, helping identify relationships that are:
The value shown at the intersection of two variables is the Pearson correlation coefficient. Color:
1. Dark red = strong positive correlation 2. Dark blue = strong negative correlation 3. Lighter shades = weak or no correlation
The diagonal is always 1.00 because each variable is perfectly correlated with itself.
Insights drawn:
| Feature | Correlation | Meaning |
|---|---|---|
oldpeak |
+0.40 | Higher ST depression = more likely heart disease |
sex |
+0.30 | Males (usually coded as 1) more likely to have heart disease |
chestpaintype |
+0.28 | Some chest pain types strongly associated with heart disease |
fastingbs |
+0.26 | High fasting blood sugar may indicate heart risk |
exerciseangina |
–0.49 | Presence of exercise-induced angina reduces heart disease likelihood (inverse) |
maxhr |
–0.40 | Higher maximum heart rate = lower risk (healthy heart performance) |
age |
–0.22 | Mild negative relation — older people slightly less represented (data-specific) |
cholesterol |
–0.06 | Very weak relationship — cholesterol alone isn't a good predictor here |
import pandas as pd
# Load data
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower()
# Rename for consistency
df.rename(columns={'chestpaintype': 'ChestPainType', 'heartdisease': 'HeartDisease'}, inplace=True)
# Map HeartDisease values for clarity
df['HeartDisease'] = df['HeartDisease'].map({0: 'No Disease', 1: 'Heart Disease'})
# Group and count
grouped_counts = df.groupby(['ChestPainType', 'HeartDisease']).size().unstack(fill_value=0)
# Add a total column
grouped_counts['Total'] = grouped_counts.sum(axis=1)
# Display result
print(grouped_counts)
HeartDisease Heart Disease No Disease Total ChestPainType ASY 392 104 496 ATA 24 149 173 NAP 72 131 203 TA 20 26 46
# Sankey Diagram (chestpain vs heart disease)
# importing the required libraries
import pandas as pd
import plotly.graph_objects as go
# Loading the dataset
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower() # removal of unwanted whitespaces
# Renaming for consistency
df.rename(columns={'chestpaintype': 'ChestPainType', 'heartdisease': 'HeartDisease'}, inplace=True)
# Mapping HeartDisease 0/1 to readable labels
df['HeartDisease'] = df['HeartDisease'].map({0: 'No Disease', 1: 'Heart Disease'})
# Grouping of data
grouped = df.groupby(['ChestPainType', 'HeartDisease']).size().reset_index(name='count')
# Preparing the labels
chest_labels = grouped['ChestPainType'].unique().tolist()
disease_labels = ['No Disease', 'Heart Disease']
labels = chest_labels + disease_labels
label_indices = {label: idx for idx, label in enumerate(labels)}
# Creating source, target, and values
source = [label_indices[row['ChestPainType']] for _, row in grouped.iterrows()]
target = [label_indices[row['HeartDisease']] for _, row in grouped.iterrows()]
value = grouped['count'].tolist()
# red links for heart disease
link_colors = ['red' if labels[t] == 'Heart Disease' else 'lightgray' for t in target]
# plotting Sankey Diagram
fig = go.Figure(data=[go.Sankey(
node=dict(
pad=15,
thickness=20,
line=dict(color="black", width=0.5),
label=labels,
color="lightblue"
),
link=dict(
source=source,
target=target,
value=value,
color=link_colors
)
)])
# displaying the diagram
fig.update_layout(
title_text="Sankey Diagram: Chest Pain Type → Heart Disease Outcome",
font_size=12
)
fig.show()
Interpretation of output:
The Sankey diagram illustrates the movement of patients experiencing various chest pain types to their corresponding heart disease results. This type of visualization is superb for displaying ratios and shifts among categorical characteristics.
Plot details:
Left nodes: Different chest pain types (e.g., ASY, ATA, NAP, TA)
Right nodes: Heart disease outcomes:
Links (flows):
Width = number of patients
Color: Red if the outcome is heart disease and gray if the outcomme is no heart disease
Insights drawn:
1. Asymptomatic (ASY): The largest red flow likely comes from ASY, indicating a high proportion of heart disease among asymptomatic patients.
Result: Many people with no noticeable symptoms (ASY) still have heart disease — a major risk indicator.
2. Atypical Angina (ATA) and Non-Anginal Pain (NAP) These chest pain types have more balanced flows, with both red and gray.
Result: ATA and NAP occur in both groups but tend to lean slightly more toward no disease.
3. Typical Angina (TA): May have fewer overall cases in the dataset. Possibly a mix of both outcomes or slightly skewed toward one.
# Simulated ECG Plot based on restingecg, oldpeak, maxhr
# importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Loading the dataset
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower() # removal of unwanted whitespaces
# Renaming and cleaning
df.rename(columns={'heartdisease': 'HeartDisease'}, inplace=True)
# Selection of 4 sample rows from the dataset
sample_df = df.sample(4, random_state=42) # 2 with, 2 without disease
# generating a Simulated ECG waveform function
def generate_ecg_signal(heart_rate, oldpeak, rest_ecg, length=1.0, fs=250):
t = np.linspace(0, length, int(length * fs))
baseline = np.sin(2 * np.pi * heart_rate / 60 * t) * np.exp(-2 * t)
if rest_ecg == 'LVH':
baseline += 0.3 * np.sin(8 * np.pi * t)
if rest_ecg == 'ST':
baseline += 0.2
baseline -= oldpeak * 0.2
return t, baseline
# Plot setup
fig, axs = plt.subplots(2, 2, figsize=(12, 6))
axs = axs.flatten()
for i, (idx, row) in enumerate(sample_df.iterrows()):
hr = row['maxhr'] if 'maxhr' in row else 80
oldpeak = row['oldpeak'] if 'oldpeak' in row else 0
rest_ecg = row['restingecg'] if 'restingecg' in row else 'Normal'
disease = row['HeartDisease']
t, ecg = generate_ecg_signal(hr, oldpeak, rest_ecg)
axs[i].plot(t, ecg, color='red' if disease == 1 else 'green')
axs[i].set_title(f"Patient {idx} | {'❤️ Disease' if disease == 1 else ' No Disease'}")
axs[i].set_xlabel("Time (s)")
axs[i].set_ylabel("Voltage")
axs[i].grid(True)
# displaying the plots
plt.tight_layout()
plt.suptitle("Simulated ECG Plot", fontsize=16, y=1.02)
plt.show()
Interpretation:
We have taken 4 random samples from the dataset to setup the simulated ECG plot based on the maximum heart-rate(maxhr), oldpeak, restingecg. It helps us to visualize patient conditions and simulate how ECG readings might vary with different risk indicators.
Plot details:
Each subplot represents one patient's simulated ECG waveform.
Color:
🔴 Red: Patient has heart disease (HeartDisease = 1)
🟢 Green: No heart disease (HeartDisease = 0)
The waveform is generated using:
maxhr: Affects the frequency (how fast the heart beats)
oldpeak: ST depression, reduces the baseline
restingecg:
'Normal': Baseline sinusoidal
'LVH': Adds abnormal sinus component (left ventricular hypertrophy)
'ST': Adds an offset (ST abnormality)
Observations:
1. Waveform Shape: Healthy patients (green) generally have a smooth and regular waveform.
Heart disease patients (red) often show:
2. Heart Rate Impact: Higher maxhr results in faster oscillations (tighter waveform cycles). You can visually compare heart rhythm frequency between patients.
3. Disease vs. Non-Disease: Simulated ECGs for patients with heart disease may show:
#2D - Density plot: Maxhr vs oldpeak
# Loading the dataset
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower()
# Checking for column names
if 'maxhr' not in df.columns or 'oldpeak' not in df.columns:
raise ValueError("Columns 'maxhr' or 'oldpeak' not found in dataset.")
# Creating the 2D KDE plot
plt.figure(figsize=(8, 6))
sns.kdeplot(
data=df,
x='maxhr',
y='oldpeak',
fill=True,
cmap='Reds',
thresh=0.05,
levels=100
)
# Adding labels, title and displaying the plot
plt.title('2D Density Plot: MaxHR vs Oldpeak')
plt.xlabel('Max Heart Rate Achieved')
plt.ylabel('Oldpeak (ST Depression)')
plt.grid(True)
plt.tight_layout()
plt.show()
Interpretation:
This 2D KDE (Kernel Density Estimate) plot visualizes the joint distribution between:
maxhr (Maximum Heart Rate Achieved)
oldpeak (ST depression induced by exercise)
It shows where patients are concentrated in terms of these two important clinical variables.
Plot details:
X-axis: maxhr – how high the patient's heart rate went
Y-axis: oldpeak – a measure of stress-induced ST depression
Color Intensity: Shows density of data points
Darker red = more patients in that region
Lighter/white = fewer or no patients
Observations:
1. Densest Region (Hotspot) Most patients cluster around:
maxhr between 130–170 bpm
oldpeak around 0 to 1
This suggests the majority of patients had normal ST response and moderately high heart rates during stress.
2. Rare Combinations Very few patients have:
oldpeak > 3 (severe ST depression)
maxhr < 100 (inability to reach high heart rate)
These could be potential high-risk cases, but rare in the dataset.
We might notice that as oldpeak increases, maxhr tends to decrease slightly, indicating possible heart dysfunction or exercise intolerance.
plt.figure(figsize=(10, 6))
plt.hexbin(df['restingbp'], df['oldpeak'], gridsize=25, cmap='magma', bins='log')
plt.colorbar(label='log10(count)')
plt.xlabel('Resting Blood Pressure (mm Hg)')
plt.ylabel('Oldpeak (ST Depression)')
plt.title('Hexbin Plot: Resting BP vs Oldpeak')
plt.grid(alpha=0.3)
plt.show()
Interpretation:
The hexbin plot is a two-dimensional histogram using hexagonal bins to show the density of data points, especially useful when visualizing large datasets with overlapping values.
Plot details:
X-axis: restingbp — Resting Blood Pressure (in mm Hg)
Y-axis: oldpeak — ST Depression induced by exercise
Color Intensity (from the 'magma' colormap):
Represents log-scaled frequency of data points in each bin
Brighter/yellower hexes = More data points in that region
Darker/purpler hexes = Fewer data points
Colorbar: Shows the log10(count) scale of frequency
Observations:
| Region on Plot | Interpretation |
|---|---|
| High concentration around BP ~120–140 & Oldpeak ~0 | Most patients had normal blood pressure and no ST depression |
| Scattered points with high Oldpeak (>2) | Fewer patients had severe exercise-induced ST depression |
| Low RestingBP & High Oldpeak | Very rare — may signal abnormal or concerning cases |
# 3D scatter plot
# importing the required libraries
import pandas as pd
import plotly.express as px
# Loading the dataset
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower()
# Converting categorical binary features
if df['sex'].dtype == 'object':
df['sex'] = df['sex'].map({'M': 1, 'F': 0})
if df['exerciseangina'].dtype == 'object':
df['exerciseangina'] = df['exerciseangina'].map({'Y': 1, 'N': 0})
if df['fastingbs'].dtype == 'object':
df['fastingbs'] = df['fastingbs'].map({'Y': 1, 'N': 0})
# Choosing three important variables in the dataset
x_var = 'oldpeak'
y_var = 'maxhr'
z_var = 'age'
# Creating a 3D scatter plot with target as color
fig = px.scatter_3d(df, x=x_var, y=y_var, z=z_var,
color='heartdisease',
color_continuous_scale='RdBu',
title='3D Scatter Plot: Age vs MaxHR vs Oldpeak colored by Heart Disease')
fig.update_traces(marker=dict(size=4))
fig.update_layout(scene=dict(
xaxis_title=x_var,
yaxis_title=y_var,
zaxis_title=z_var
))
fig.show()
Interpretation:
This 3D scatter plot created using Plotly provides an interactive visual analysis of three important features and their relation to heart disease presence (heartdisease = 1) or absence (heartdisease = 0).
Plot details:
X-axis → oldpeak: ST depression after exercise (a heart stress indicator)
Y-axis → maxhr: Maximum heart rate achieved
Z-axis → age: Patient’s age
Color → Indicates heart disease presence:
0 (typically shown in blue) → No heart disease
1 (typically shown in red) → Heart disease
Observations:
1. Clustering You may notice distinct clusters of red and blue points:
*Heart disease cases (red) often cluster in regions with:
Higher oldpeak
Lower maxhr
Older age
No disease cases (blue) appear in:
Lower oldpeak
Higher maxhr
Younger to middle-aged individuals
2. Correlations Inverse relation between maxhr and disease status:
Low max heart rate → more likely to have heart disease.
Positive relation between oldpeak and disease:
Higher ST depression → more red points (heart disease).
Age trend:
Older patients tend to be more frequently red-coded.
| Feature | Trend in Heart Disease Patients |
|---|---|
oldpeak |
Higher |
maxhr |
Lower |
age |
Older |
# ISH chart
# importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
# Loading the dataset
df = pd.read_csv("heart.csv")
# Encoding the categorical variables
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
df_encoded = df.copy()
for col in categorical_cols:
df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])
# Calculating the importance (correlation with target)
correlations = df_encoded.corr()['HeartDisease'].drop('HeartDisease')
# Defining severity (using standard deviation as a proxy)
severity = df_encoded[correlations.index].std()
# Defining hazard level (proportion of extreme values, simplified)
hazard = {}
for col in correlations.index:
q1 = df_encoded[col].quantile(0.25)
q3 = df_encoded[col].quantile(0.75)
iqr = q3 - q1
outliers = ((df_encoded[col] < (q1 - 1.5 * iqr)) | (df_encoded[col] > (q3 + 1.5 * iqr))).sum()
hazard[col] = outliers / len(df_encoded)
# Converting to DataFrame
ish_df = pd.DataFrame({
'Feature': correlations.index,
'Importance': correlations.values,
'Severity': severity.values,
'Hazard': [hazard[f] for f in correlations.index]
})
# Plotting the ISH Chart
plt.figure(figsize=(10, 7))
scatter = plt.scatter(
ish_df['Importance'],
ish_df['Severity'],
s=ish_df['Hazard'] * 1500 + 20, # Scale size for visibility
c=ish_df['Hazard'],
cmap='Reds',
alpha=0.8,
edgecolors='black'
)
plt.xlabel("Importance (Correlation with Heart Disease)")
plt.ylabel("Severity (Standard Deviation)")
plt.title("ISH Chart of Medical Risk Factors")
plt.colorbar(label="Hazard Level (Outlier Proportion)")
# Annotating the feature names
for _, row in ish_df.iterrows():
plt.text(row['Importance'], row['Severity'], row['Feature'], fontsize=9, ha='right')
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
Interpretation:
The ISH chart (Importance-Severity-Hazard) is an advanced visualization that helps evaluate features based on:
Importance: how strongly a feature correlates with the target (HeartDisease)
Severity: the variability (standard deviation) of a feature
Hazard: proportion of extreme or outlier values in the feature
Plot details:
X-axis: Feature Importance (correlation)
Y-axis: Severity Score (based on domain knowledge or variance)
Size/Color: Hazard Level (based on anomaly count or outlier proportion)
| Feature | Importance | Severity | Hazard Level | Interpretation |
|---|---|---|---|---|
| Oldpeak | High | Moderate | High | Strong predictor of heart disease, and extreme values indicate serious ST depression (ischemia). |
| ChestPainType | High | Low | Moderate | Highly predictive, but doesn’t vary as much — likely categorical. |
| ST_Slope | Moderate | Low | Low | Useful predictor, not many outliers. |
| MaxHR | Moderate | High | Moderate | Strong clinical variability, may indicate exercise tolerance issues. |
| Cholesterol | Low | High | Very High | Not a strong predictor here, but many extreme values suggest data quality issues or rare conditions. |
| RestingBP | Low | Moderate | High | Not a strong predictor on its own, but outliers may flag hypertensive emergencies. |
Features like Oldpeak and ChestPainType are highly important, and Oldpeak also carries a high hazard due to extreme clinical readings.
Cholesterol, while clinically important, shows low predictive power here — possibly due to data entry errors (e.g., zeros).
This chart helps balance statistical importance with clinical risk, making it a powerful tool for explainable AI in healthcare.
import pandas as pd
# Loading and cleaning the data
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower()
# Converting binary categorical features to numeric features
if df['fastingbs'].dtype == 'object':
df['fastingbs'] = df['fastingbs'].map({'Y': 1, 'N': 0})
if df['exerciseangina'].dtype == 'object':
df['exerciseangina'] = df['exerciseangina'].map({'Y': 1, 'N': 0})
# Categorization of the age group
df['age_group'] = pd.cut(df['age'],
bins=[0, 40, 60, 100],
labels=['Young', 'Middle-aged', 'Senior'])
# Composite metabolic risk: cholesterol (normalized) + fastingbs
df['cholesterol_norm'] = (df['cholesterol'] - df['cholesterol'].min()) / (df['cholesterol'].max() - df['cholesterol'].min())
df['metabolic_risk'] = df['cholesterol_norm'] + df['fastingbs']
# Severe stress flag: oldpeak > 2 and exerciseangina = 1
df['severe_stress'] = ((df['oldpeak'] > 2) & (df['exerciseangina'] == 1)).astype(int)
# Preview new columns
print(df[['age', 'age_group', 'cholesterol', 'fastingbs', 'metabolic_risk', 'oldpeak', 'exerciseangina', 'severe_stress']].head())
age age_group cholesterol fastingbs metabolic_risk oldpeak \ 0 40 Young 289 0 0.479270 0.0 1 49 Middle-aged 180 0 0.298507 1.0 2 37 Young 283 0 0.469320 0.0 3 48 Middle-aged 214 0 0.354892 1.5 4 54 Middle-aged 195 0 0.323383 0.0 exerciseangina severe_stress 0 0 0 1 0 0 2 0 0 3 1 0 4 0 0
Interpretation:
We perform feature engineering by transforming and creating new variables to extract more meaningful insights from the dataset.
Steps taken:
We converted the categorical features into binary features for e.g: Converts text labels ('Y', 'N') into numeric binary (1, 0) values.
We created a new categorical feature called age_group which makes it easier to analyze heart disease prevalence by life stage.
We normalized cholestrol between 0 and 1. We combine it with fastingbs to create a composite risk factor called metabolic_risk. It helps to flag patients with high cholestrol and high BP.
We flag patient experiencing exercise-induced stress AND ST depression > 2. These conditions together may indicate critical heart risk. severe_stress = 1 means the patient likely needs further attention.
We then finally print the new columns:
age_group
cholesterol_norm
metabolic_risk
severe_stress
# importing the required dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Loading the data
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower()
# Converting 'Y'/'N' binary columns to 1/0
if df['fastingbs'].dtype == 'object':
df['fastingbs'] = df['fastingbs'].map({'Y': 1, 'N': 0})
if df['exerciseangina'].dtype == 'object':
df['exerciseangina'] = df['exerciseangina'].map({'Y': 1, 'N': 0})
# performing Feature Engineering
df['age_group'] = pd.cut(df['age'], bins=[0, 40, 60, 100], labels=['Young', 'Middle-aged', 'Senior'])
df['cholesterol_norm'] = (df['cholesterol'] - df['cholesterol'].min()) / (df['cholesterol'].max() - df['cholesterol'].min())
df['metabolic_risk'] = df['cholesterol_norm'] + df['fastingbs']
df['severe_stress'] = ((df['oldpeak'] > 2) & (df['exerciseangina'] == 1)).astype(int)
# Plotting the graphs
plt.figure(figsize=(15, 10))
# 1. Age Group Count
plt.subplot(2, 2, 1)
sns.countplot(x='age_group', data=df, palette='Set2')
plt.title('Count of Patients by Age Group')
plt.ylabel('Number of Patients')
# 2. Violin Plot of Metabolic Risk vs Heart Disease
plt.subplot(2, 2, 2)
sns.violinplot(x='heartdisease', y='metabolic_risk', data=df, palette='coolwarm')
plt.title('Metabolic Risk by Heart Disease Status')
plt.xlabel('Heart Disease (0 = No, 1 = Yes)')
# 3. Boxplot of Cholesterol (Normalized) by Age Group
plt.subplot(2, 2, 3)
sns.boxplot(x='age_group', y='cholesterol_norm', data=df, palette='Set3')
plt.title('Cholesterol (Normalized) by Age Group')
# 4. Severe Stress vs Heart Disease
plt.subplot(2, 2, 4)
sns.countplot(x='severe_stress', hue='heartdisease', data=df, palette='Set1')
plt.title('Severe Stress vs Heart Disease')
plt.xlabel('Severe Stress (1 = Yes, 0 = No)')
plt.ylabel('Number of Patients')
# displaying the plots
plt.tight_layout()
plt.show()
IMPORTANT NOTE:
“These features were engineered during EDA to explore patterns in the data. However, they were not included in model training.”
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Load dataset
df = pd.read_csv("heart.csv")
# Clean column names
df.columns = df.columns.str.strip()
# Encode categorical variables
df_encoded = df.copy()
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for col in categorical_cols:
if col in df_encoded.columns:
df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])
else:
print(f"⚠️ Column '{col}' not found in the dataframe!")
# Preparing features and target
X = df_encoded.drop(columns=['HeartDisease'])
y = df_encoded['HeartDisease']
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Training Random Forest model
model = RandomForestClassifier()
model.fit(X_train, y_train)
# Feature importances
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances)
ST_Slope 0.242766 Cholesterol 0.113802 Oldpeak 0.108979 ExerciseAngina 0.103862 MaxHR 0.100141 ChestPainType 0.091722 Age 0.089734 RestingBP 0.069247 Sex 0.035929 RestingECG 0.025776 FastingBS 0.018043 dtype: float64
Interpretation:
Higher score → More influence in the decision-making process of the model.
Lower score (near 0) → Minimal or no contribution to prediction.
ST_Slope has the highest importance (0.24), it means this is the most influential factor for predicting heart disease in our dataset.
FastingBS has the lowest importance (0.018), it means this has no impact on the model's decisions as it has low variance.
Quantitative variables in our dataset:
| Column Name | Description |
|---|---|
age |
Age in years |
restingbp |
Resting blood pressure (in mm Hg) |
cholesterol |
Serum cholesterol (in mg/dL) |
maxhr |
Maximum heart rate achieved |
oldpeak |
ST depression induced by exercise |
# performing Independent t-tests for the 5 quantitative/continuous variables
from scipy.stats import ttest_ind
import pandas as pd
# Load dataset
df = pd.read_csv("heart.csv")
# Clean column names
df.columns = df.columns.str.strip()
# Split dataset based on HeartDisease outcome
group_0 = df[df['HeartDisease'] == 0]
group_1 = df[df['HeartDisease'] == 1]
# 5 Quantitative features to test
features = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
ttest_results = []
# Performing t-tests
for feature in features:
stat, p = ttest_ind(group_0[feature], group_1[feature], nan_policy='omit')
ttest_results.append((feature, round(stat, 2), round(p, 3)))
# Prints results
print("\nT-Test Results (Numerical Features vs Heart Disease)\n")
print(f"{'Feature':<15} {'t-statistic':<15} {'p-value'}")
print("-" * 45)
for feature, stat, p in ttest_results:
print(f"{feature:<15} {stat:<15} {p}")
T-Test Results (Numerical Features vs Heart Disease) Feature t-statistic p-value --------------------------------------------- Age -8.9 0.0 RestingBP -3.28 0.001 Cholesterol 7.24 0.0 MaxHR 13.23 0.0 Oldpeak -13.36 0.0
Interpretation:
Null Hypothesis (H₀): There is no significant difference in the mean value of the variable between patients with and without heart disease.
Alternative Hypothesis (H₁): There is a significant difference in the mean value of the variable between the two groups.
Test Type Used: Independent two-sample t-test (Two-tailed)
Significance level: typically (α=0.05)
Observations:
| Feature | t-statistic | p-value | Significant? (p < 0.05) | Interpretation |
|---|---|---|---|---|
| Age | -8.90 | 0.000 | ✅ Yes | Heart disease patients are significantly older than non-heart patients. |
| RestingBP | -3.28 | 0.001 | ✅ Yes | Resting blood pressure is significantly lower in heart disease patients. |
| Cholesterol | 7.24 | 0.000 | ✅ Yes | Heart disease patients have significantly higher cholesterol levels. |
| MaxHR | 13.23 | 0.000 | ✅ Yes | Maximum heart rate is significantly higher in heart disease patients. (Unusual; may need to check data) |
| Oldpeak | -13.36 | 0.000 | ✅ Yes | Oldpeak is significantly lower in heart disease patients. (Also unusual, usually it increases) |
Qualitative variables in our dataset:
| Column Name | Description |
|---|---|
sex |
Biological sex of the patient (Male, Female) |
chestpaintype |
Type of chest pain: • TA – Typical Angina• ATA – Atypical Angina• NAP – Non-Anginal Pain• ASY – Asymptomatic |
restingecg |
Resting electrocardiogram results: • Normal – Normal ECG• ST – ST-T wave abnormality• LVH Left ventricular hypertrophy |
exerciseangina |
Exercise-induced angina: • 1 – Yes• 0 – No |
st_slope |
Slope of the peak exercise ST segment: • Up – Upsloping• Flat – Flat• Down – Downsloping |
# performing chi-square test for 5 categorical variables
#importing the required libraries
import pandas as pd
from scipy.stats import chi2_contingency
# Loading the dataset
df = pd.read_csv("heart.csv")
df.columns = df.columns.str.strip().str.lower() #removal of unwanted whitespaces
# Defining the categorical variables to test
categorical_vars = ['sex', 'chestpaintype', 'restingecg', 'exerciseangina', 'st_slope']
print("Chi-Square Test Results:\n")
# Performing Chi-Square test for each variable
for var in categorical_vars:
contingency_table = pd.crosstab(df[var], df['heartdisease'])
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Variable: {var}")
print(f"Chi2 Statistic = {chi2:.3f}, p-value = {p:.4f}, Degrees of Freedom = {dof}")
if p < 0.05:
print("➡️ Significant association with Heart Disease (Reject H0)\n")
else:
print("➡️ No significant association with Heart Disease (Fail to Reject H0)\n")
Chi-Square Test Results: Variable: sex Chi2 Statistic = 84.145, p-value = 0.0000, Degrees of Freedom = 1 ➡️ Significant association with Heart Disease (Reject H0) Variable: chestpaintype Chi2 Statistic = 268.067, p-value = 0.0000, Degrees of Freedom = 3 ➡️ Significant association with Heart Disease (Reject H0) Variable: restingecg Chi2 Statistic = 10.931, p-value = 0.0042, Degrees of Freedom = 2 ➡️ Significant association with Heart Disease (Reject H0) Variable: exerciseangina Chi2 Statistic = 222.259, p-value = 0.0000, Degrees of Freedom = 1 ➡️ Significant association with Heart Disease (Reject H0) Variable: st_slope Chi2 Statistic = 355.918, p-value = 0.0000, Degrees of Freedom = 2 ➡️ Significant association with Heart Disease (Reject H0)
Interpretation:
Null Hypothesis (H₀): The mean heart disease status (i.e., probability of having heart disease) is the same across all categories of the variable.
Alternative Hypothesis (H₁): At least one category has a different mean heart disease status, suggesting an association with heart disease.
Test Type used: Chi_square test
Significance Level: typically (α=0.05)
Note: For each categorical feature in the dataset, we formulated and tested hypotheses to evaluate its association with heart disease status.
Observations:
| Variable | Chi² Statistic | p-value | df | Significant? (p < 0.05) | Interpretation |
|---|---|---|---|---|---|
| Sex | 84.15 | 0.0000 | 1 | ✅ Yes | There is a significant association between sex and heart disease. |
| ChestPainType | 268.07 | 0.0000 | 3 | ✅ Yes | Chest pain type is strongly associated with heart disease. |
| RestingECG | 10.93 | 0.0042 | 2 | ✅ Yes | Resting ECG results are significantly related to heart disease. |
| ExerciseAngina | 222.26 | 0.0000 | 1 | ✅ Yes | Exercise-induced angina is highly associated with heart disease. |
| ST_Slope | 355.92 | 0.0000 | 2 | ✅ Yes | ST segment slope shows a very strong association with heart disease. |
import pandas as pd
from tabulate import tabulate
# Clean up column names
df.columns = df.columns.str.strip().str.lower()
# List of categorical variables (in lowercase now)
categorical_vars = ['sex', 'chestpaintype', 'restingecg', 'exerciseangina', 'st_slope']
# prints the results
for var in categorical_vars:
print(f"\nContingency Table: {var} vs heartdisease\n")
table = pd.crosstab(df[var], df['heartdisease'], margins=True)
table.columns = ['heartdisease = 0', 'heartdisease = 1', 'total']
table.index.name = var
table.reset_index(inplace=True)
print(tabulate(table, headers='keys', tablefmt='grid'))
Contingency Table: sex vs heartdisease +----+-------+--------------------+--------------------+---------+ | | sex | heartdisease = 0 | heartdisease = 1 | total | +====+=======+====================+====================+=========+ | 0 | F | 143 | 50 | 193 | +----+-------+--------------------+--------------------+---------+ | 1 | M | 267 | 458 | 725 | +----+-------+--------------------+--------------------+---------+ | 2 | All | 410 | 508 | 918 | +----+-------+--------------------+--------------------+---------+ Contingency Table: chestpaintype vs heartdisease +----+-----------------+--------------------+--------------------+---------+ | | chestpaintype | heartdisease = 0 | heartdisease = 1 | total | +====+=================+====================+====================+=========+ | 0 | ASY | 104 | 392 | 496 | +----+-----------------+--------------------+--------------------+---------+ | 1 | ATA | 149 | 24 | 173 | +----+-----------------+--------------------+--------------------+---------+ | 2 | NAP | 131 | 72 | 203 | +----+-----------------+--------------------+--------------------+---------+ | 3 | TA | 26 | 20 | 46 | +----+-----------------+--------------------+--------------------+---------+ | 4 | All | 410 | 508 | 918 | +----+-----------------+--------------------+--------------------+---------+ Contingency Table: restingecg vs heartdisease +----+--------------+--------------------+--------------------+---------+ | | restingecg | heartdisease = 0 | heartdisease = 1 | total | +====+==============+====================+====================+=========+ | 0 | LVH | 82 | 106 | 188 | +----+--------------+--------------------+--------------------+---------+ | 1 | Normal | 267 | 285 | 552 | +----+--------------+--------------------+--------------------+---------+ | 2 | ST | 61 | 117 | 178 | +----+--------------+--------------------+--------------------+---------+ | 3 | All | 410 | 508 | 918 | +----+--------------+--------------------+--------------------+---------+ Contingency Table: exerciseangina vs heartdisease +----+------------------+--------------------+--------------------+---------+ | | exerciseangina | heartdisease = 0 | heartdisease = 1 | total | +====+==================+====================+====================+=========+ | 0 | N | 355 | 192 | 547 | +----+------------------+--------------------+--------------------+---------+ | 1 | Y | 55 | 316 | 371 | +----+------------------+--------------------+--------------------+---------+ | 2 | All | 410 | 508 | 918 | +----+------------------+--------------------+--------------------+---------+ Contingency Table: st_slope vs heartdisease +----+------------+--------------------+--------------------+---------+ | | st_slope | heartdisease = 0 | heartdisease = 1 | total | +====+============+====================+====================+=========+ | 0 | Down | 14 | 49 | 63 | +----+------------+--------------------+--------------------+---------+ | 1 | Flat | 79 | 381 | 460 | +----+------------+--------------------+--------------------+---------+ | 2 | Up | 317 | 78 | 395 | +----+------------+--------------------+--------------------+---------+ | 3 | All | 410 | 508 | 918 | +----+------------+--------------------+--------------------+---------+
# Anamoly detection for all the 5 quantitative variables
import pandas as pd
# Load the dataset
df = pd.read_csv('heart.csv')
df.columns = df.columns.str.strip().str.lower()
# Convert known categorical values
if df['exerciseangina'].dtype == 'object':
df['exerciseangina'] = df['exerciseangina'].map({'Y': 1, 'N': 0})
if df['fastingbs'].dtype == 'object':
df['fastingbs'] = df['fastingbs'].map({'Y': 1, 'N': 0})
# Ensure numerical fields are parsed correctly
numerical_cols = ['age', 'restingbp', 'cholesterol', 'maxhr', 'oldpeak']
for col in numerical_cols:
df[col] = pd.to_numeric(df[col], errors='coerce')
# Define rules for anomaly detection per variable
anomalies = df[
(df['age'] < 30) | (df['age'] > 80) |
(df['restingbp'] < 60) | (df['restingbp'] > 200) |
(df['cholesterol'] < 100) | (df['cholesterol'] > 400) |
(df['maxhr'] < 60) | (df['maxhr'] > 220) |
(df['oldpeak'] < 0) | (df['oldpeak'] > 6)
]
# Display anomaly summary
print(f"Detected {len(anomalies)} numerical anomalies based on clinical/statistical thresholds.")
display(anomalies)
Detected 192 numerical anomalies based on clinical/statistical thresholds.
| age | sex | chestpaintype | restingbp | cholesterol | fastingbs | restingecg | maxhr | exerciseangina | oldpeak | st_slope | heartdisease | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 28 | 53 | F | ATA | 113 | 468 | 0 | Normal | 127 | 0 | 0.0 | Up | 0 |
| 30 | 53 | M | NAP | 145 | 518 | 0 | Normal | 130 | 0 | 0.0 | Flat | 1 |
| 69 | 44 | M | ASY | 150 | 412 | 0 | Normal | 170 | 0 | 0.0 | Up | 0 |
| 76 | 32 | M | ASY | 118 | 529 | 0 | Normal | 130 | 0 | 0.0 | Flat | 1 |
| 98 | 56 | M | ASY | 120 | 85 | 0 | Normal | 140 | 0 | 0.0 | Up | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 624 | 63 | F | ASY | 150 | 407 | 0 | LVH | 154 | 0 | 4.0 | Flat | 1 |
| 667 | 65 | F | NAP | 140 | 417 | 1 | LVH | 157 | 0 | 0.8 | Up | 0 |
| 796 | 56 | F | ASY | 134 | 409 | 0 | LVH | 150 | 1 | 1.9 | Flat | 1 |
| 829 | 29 | M | ATA | 130 | 204 | 0 | LVH | 202 | 0 | 0.0 | Up | 0 |
| 850 | 62 | F | ASY | 160 | 164 | 0 | LVH | 145 | 0 | 6.2 | Down | 1 |
192 rows × 12 columns
NOTE:
192 medically unusual records has been detected. We will detect and handle them using IQR method.
import pandas as pd
# Load the dataset
df = pd.read_csv("heart.csv")
df.columns = df.columns.str.lower()
# Select numerical columns
numerical_cols = ['age', 'restingbp', 'cholesterol', 'maxhr', 'oldpeak']
# Function to remove outliers using IQR
def remove_outliers_iqr(data, column):
Q1 = data[column].quantile(0.25)
Q3 = data[column].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
return data[(data[column] >= lower) & (data[column] <= upper)]
# Applying IQR cleaning to all numerical columns
cleaned_df = df.copy()
for col in numerical_cols:
cleaned_df = remove_outliers_iqr(cleaned_df, col)
print(f"Original rows: {len(df)}")
print(f"After outlier removal: {len(cleaned_df)}")
Original rows: 918 After outlier removal: 701
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the dataset
df = pd.read_csv("heart.csv")
# List of columns to clean and visualize
columns_to_clean = ['Age', 'RestingBP', 'MaxHR', 'Oldpeak', 'Cholesterol']
# Function to remove outliers using IQR method
def remove_outliers_iqr(data, column):
Q1 = data[column].quantile(0.25)
Q3 = data[column].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
# Creating a subplot grid: one row per feature, two columns
fig, axes = plt.subplots(len(columns_to_clean), 2, figsize=(12, 18))
for i, col in enumerate(columns_to_clean):
# Before outlier removal
sns.boxplot(x=df[col], ax=axes[i, 0], color='tomato')
axes[i, 0].set_title(f'{col} - Before Outlier Removal')
# After outlier removal
cleaned = remove_outliers_iqr(df, col)
sns.boxplot(x=cleaned[col], ax=axes[i, 1], color='mediumseagreen')
axes[i, 1].set_title(f'{col} - After Outlier Removal')
# displaying the results
plt.tight_layout()
plt.show()
import pandas as pd
# Load dataset
df = pd.read_csv("heart.csv")
df.columns = df.columns.str.strip().str.lower()
# Handling categorical encodings
if df['exerciseangina'].dtype == 'object':
df['exerciseangina'] = df['exerciseangina'].map({'Y': 1, 'N': 0})
if df['fastingbs'].dtype == 'object':
df['fastingbs'] = df['fastingbs'].map({'Y': 1, 'N': 0})
# Converting categorical-like variables
categorical_cols = ['sex', 'chestpaintype', 'fastingbs', 'restingecg', 'exerciseangina', 'st_slope']
# Detect rare values (frequency < 5% of total)
rare_flags = []
total_rows = len(df)
for col in categorical_cols:
rare_values = df[col].value_counts(normalize=True)
rare_categories = rare_values[rare_values < 0.05].index.tolist()
# Flag rows with rare categories
flagged = df[df[col].isin(rare_categories)]
if not flagged.empty:
flagged['flagged_column'] = col
flagged['rare_value'] = flagged[col]
rare_flags.append(flagged)
# Combining all flagged rows
if rare_flags:
rare_cases = pd.concat(rare_flags, ignore_index=True)
print(f"Detected {len(rare_cases)} rows with rare categorical values.")
display(rare_cases[['flagged_column', 'rare_value'] + categorical_cols + ['heartdisease']])
else:
print("No rare categorical patterns detected.")
NOTE:
No rare/outliers have been detected in the categorical features.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Loading the dataset
df = pd.read_csv("heart.csv")
df.columns = df.columns.str.lower() # lowercase column names
# Detecting categorical columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
# Manually add likely categorical variables
likely_categoricals = ['sex', 'chestpaintype', 'fastingbs', 'restingecg', 'exerciseangina', 'st_slope']
for col in likely_categoricals:
if col in df.columns and col not in cat_cols:
cat_cols.append(col)
# Converting to category dtype
for col in cat_cols:
df[col] = df[col].astype('category')
# Creating a new column to track anomalies
df['categorical_anomaly'] = False
# Total number of rows
total_rows = len(df)
# Plot
sns.set(style="whitegrid")
plt.figure(figsize=(18, 15))
plot_num = 1
for col in cat_cols:
value_counts = df[col].value_counts(dropna=False)
rare_categories = value_counts[value_counts / total_rows < 0.01].index.tolist()
# If rare categories exist, mark them as anomaly
if rare_categories:
is_rare = df[col].isin(rare_categories)
df.loc[is_rare, 'categorical_anomaly'] = True
# Also flag missing values
df.loc[df[col].isna(), 'categorical_anomaly'] = True
# Plot
plt.subplot(3, 3, plot_num)
sns.countplot(data=df, x=col, hue=is_rare if rare_categories else df[col], palette="Set2")
plt.title(f"{col} (Red = Rare <1%)")
plot_num += 1
if plot_num > 9:
break
plt.tight_layout()
plt.show()
# Show flagged rows
flagged = df[df['categorical_anomaly']]
print("\nFlagged Categorical Anomalies:")
print(flagged[['categorical_anomaly'] + cat_cols].head())
Flagged Categorical Anomalies: Empty DataFrame Columns: [categorical_anomaly, sex, chestpaintype, restingecg, exerciseangina, st_slope, fastingbs] Index: []
# importing the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Loading the dataset
df = pd.read_csv("heart.csv")
# Encoding categorical features
df_encoded = df.copy()
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for col in categorical_cols:
le = LabelEncoder()
df_encoded[col] = le.fit_transform(df[col])
# Splitting features and target
X = df_encoded.drop("HeartDisease", axis=1)
y = df_encoded["HeartDisease"]
# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Train Logistic Regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
# Predictions
y_pred = model.predict(X_test)
# Evaluation
print("=== Logistic Regression Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
=== Logistic Regression Results ===
Accuracy: 0.8478
Confusion Matrix:
[[68 9]
[19 88]]
Classification Report:
precision recall f1-score support
0 0.78 0.88 0.83 77
1 0.91 0.82 0.86 107
accuracy 0.85 184
macro avg 0.84 0.85 0.85 184
weighted avg 0.85 0.85 0.85 184
# importing the required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
# Loading the dataset
df = pd.read_csv("heart.csv")
# Encoding categorical features
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
df[col] = LabelEncoder().fit_transform(df[col])
for col in ['Cholesterol', 'RestingBP', 'Oldpeak', 'MaxHR']:
df = remove_outliers_iqr(df, col)
# SPLIT AND BALANCE DATA
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']
# Balance using SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)
# FEATURE SCALING
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# MODEL TUNING
param_grid = {
'C': [0.01, 0.1, 1, 10, 100],
'penalty': ['l1', 'l2'],
'solver': ['liblinear']
}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
# EVALUATION
y_pred = grid.predict(X_test)
print("Best Params:", grid.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
Best Params: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.851063829787234
Classification Report:
precision recall f1-score support
0 0.86 0.85 0.85 71
1 0.85 0.86 0.85 70
accuracy 0.85 141
macro avg 0.85 0.85 0.85 141
weighted avg 0.85 0.85 0.85 141
Confusion Matrix:
[[60 11]
[10 60]]
C:\Users\Sharon Karabel\anaconda3\lib\site-packages\sklearn\base.py:474: FutureWarning: `BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.
Interpretation:
We have achieved accuracy of 84.78 before and now after applying the gridsearchcv, we got 85.10.
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
# Load the dataset
df = pd.read_csv("heart.csv")
# Encode categorical variables
df_encoded = df.copy()
label_encoders = {}
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for col in categorical_cols:
le = LabelEncoder()
df_encoded[col] = le.fit_transform(df[col])
label_encoders[col] = le
# Defining features and target
X = df_encoded.drop("HeartDisease", axis=1)
y = df_encoded["HeartDisease"]
# Scale the features
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# ----- Hyperparameter Tuning -----
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [5, 10, 15, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)
# Best model
best_rf = grid.best_estimator_
y_pred = best_rf.predict(X_test)
# Evaluation
print("=== Random Forest Classifier Results (Tuned) ===")
print(f"Best Parameters: {grid.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
=== Random Forest Classifier Results (Tuned) ===
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Accuracy: 0.8804
Confusion Matrix:
[[66 11]
[11 96]]
Classification Report:
precision recall f1-score support
0 0.86 0.86 0.86 77
1 0.90 0.90 0.90 107
accuracy 0.88 184
macro avg 0.88 0.88 0.88 184
weighted avg 0.88 0.88 0.88 184
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import RFECV
# Load the data
df = pd.read_csv("heart.csv")
# Encode categorical variables
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for col in categorical_cols:
df[col] = LabelEncoder().fit_transform(df[col])
# Outlier removal using IQR
def remove_outliers(df, col):
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
return df[(df[col] >= lower) & (df[col] <= upper)]
for col in ['Cholesterol', 'RestingBP', 'Oldpeak', 'MaxHR']:
df = remove_outliers(df, col)
# splitting the Feature and target
X = df.drop("HeartDisease", axis=1)
y = df["HeartDisease"]
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Recursive Feature Elimination with Cross Validation
base_rf = RandomForestClassifier(random_state=42)
rfecv = RFECV(estimator=base_rf, step=1, cv=5, scoring='accuracy')
rfecv.fit(X_scaled, y)
# Reduced features
X_selected = rfecv.transform(X_scaled)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
# Hyperparameter grid
param_grid = {
'n_estimators': [200, 300, 500],
'max_depth': [10, 15, 20],
'min_samples_split': [2, 5],
'min_samples_leaf': [1, 2],
'class_weight': ['balanced'] # Handling imbalance internally
}
grid = GridSearchCV(RandomForestClassifier(random_state=42),
param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)
# Evaluation
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
print("=== Final Random Forest Model ===")
print("Best Params:", grid.best_params_)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
=== Final Random Forest Model ===
Best Params: {'class_weight': 'balanced', 'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy: 0.8936
Confusion Matrix:
[[61 10]
[ 5 65]]
Classification Report:
precision recall f1-score support
0 0.92 0.86 0.89 71
1 0.87 0.93 0.90 70
accuracy 0.89 141
macro avg 0.90 0.89 0.89 141
weighted avg 0.90 0.89 0.89 141
Interpretation:
We have increased the accuracy from 88.04 to 89.36 by applying IQR and scaling the features.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
# Loading the dataset
df = pd.read_csv("heart.csv") # Adjust path if needed
# Encode categorical features
df_encoded = df.copy()
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for col in categorical_cols:
le = LabelEncoder()
df_encoded[col] = le.fit_transform(df[col])
# Define features and target
X = df_encoded.drop("HeartDisease", axis=1)
y = df_encoded["HeartDisease"]
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize XGBoost model
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
# Make predictions
y_pred = xgb.predict(X_test)
# Evaluation of model
print("=== XGBoost Classifier Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
=== XGBoost Classifier Results ===
Accuracy: 0.8696
Confusion Matrix:
[[69 8]
[16 91]]
Classification Report:
precision recall f1-score support
0 0.81 0.90 0.85 77
1 0.92 0.85 0.88 107
accuracy 0.87 184
macro avg 0.87 0.87 0.87 184
weighted avg 0.87 0.87 0.87 184
C:\Users\Sharon Karabel\anaconda3\lib\site-packages\xgboost\core.py:158: UserWarning:
[01:50:11] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier, plot_importance
import matplotlib.pyplot as plt
# Load dataset
df = pd.read_csv("heart.csv")
# Encode categorical features
df_encoded = df.copy()
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for col in categorical_cols:
label_encoder = LabelEncoder()
df_encoded[col] = label_encoder.fit_transform(df_encoded[col])
# Define features and target
x = df_encoded.drop("HeartDisease", axis=1)
y = df_encoded["HeartDisease"]
# Scale numerical features
numeric_cols = x.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
x[numeric_cols] = scaler.fit_transform(x[numeric_cols])
# Stratified train-test split
x_train, x_test, y_train, y_test = train_test_split(
x, y, test_size=0.2, stratify=y, random_state=42
)
# Initialize and train XGBoost model
xgb_model = XGBClassifier(
n_estimators=300,
learning_rate=0.03,
max_depth=4,
subsample=0.8,
colsample_bytree=0.8,
use_label_encoder=False,
eval_metric='logloss',
random_state=42
)
xgb_model.fit(x_train, y_train)
# Make predictions
y_pred = xgb_model.predict(x_test)
# Evaluation
print("=== XGBoost Classifier Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
=== XGBoost Classifier Results ===
Accuracy: 0.8967
Confusion Matrix:
[[73 9]
[10 92]]
Classification Report:
precision recall f1-score support
0 0.88 0.89 0.88 82
1 0.91 0.90 0.91 102
accuracy 0.90 184
macro avg 0.90 0.90 0.90 184
weighted avg 0.90 0.90 0.90 184
C:\Users\Sharon Karabel\anaconda3\lib\site-packages\xgboost\core.py:158: UserWarning:
[01:50:14] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
Interpretation:
In the beginning we got an accuracy of 86.96 using XGBoost model.
After implementing XGBClassifierfit() and early_stoppings, we got higher accuracy of 89.67
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Load data
df = pd.read_csv("heart.csv")
# Encode categorical variables
df_encoded = df.copy()
label_encoders = {}
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for col in categorical_cols:
le = LabelEncoder()
df_encoded[col] = le.fit_transform(df[col])
label_encoders[col] = le # Save encoder if needed later
# Split data into features and target
X = df_encoded.drop('HeartDisease', axis=1)
y = df_encoded['HeartDisease']
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train Gaussian Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)
# Predictions
y_pred = model.predict(X_test)
# Evaluation
print("=== Naive Bayes Classifier Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
=== Naive Bayes Classifier Results ===
Accuracy: 0.8424
Confusion Matrix:
[[65 12]
[17 90]]
Classification Report:
precision recall f1-score support
0 0.79 0.84 0.82 77
1 0.88 0.84 0.86 107
accuracy 0.84 184
macro avg 0.84 0.84 0.84 184
weighted avg 0.84 0.84 0.84 184
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
# Load dataset
df = pd.read_csv("heart.csv") # Ensure correct path
# Encode categorical variables
cat_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for col in cat_cols:
df[col] = LabelEncoder().fit_transform(df[col])
# Define X and y
X = df.drop("HeartDisease", axis=1)
y = df["HeartDisease"]
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
# Define models
models = {
"Logistic Regression": LogisticRegression(max_iter=1000),
"Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
"XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
"Naive Bayes": GaussianNB()
}
# Evaluate each model
metrics = {}
for name, model in models.items():
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred
metrics[name] = {
"Accuracy": round(accuracy_score(y_test, y_pred), 4),
"Precision": round(precision_score(y_test, y_pred), 4),
"Recall": round(recall_score(y_test, y_pred), 4),
"F1 Score": round(f1_score(y_test, y_pred), 4),
"ROC AUC": round(roc_auc_score(y_test, y_proba), 4),
"Confusion Matrix": confusion_matrix(y_test, y_pred).tolist() # as list for readability
}
# Create DataFrame for table
performance_table = pd.DataFrame.from_dict(metrics, orient='index')
performance_table.reset_index(inplace=True)
performance_table.rename(columns={'index': 'Model'}, inplace=True)
# Display table
print(performance_table)
Model Accuracy Precision Recall F1 Score ROC AUC \
0 Logistic Regression 0.8696 0.8482 0.9314 0.8879 0.8957
1 Random Forest 0.8913 0.8868 0.9216 0.9038 0.9298
2 XGBoost 0.8750 0.8911 0.8824 0.8867 0.9254
3 Naive Bayes 0.8913 0.8942 0.9118 0.9029 0.9280
Confusion Matrix
0 [[65, 17], [7, 95]]
1 [[70, 12], [8, 94]]
2 [[71, 11], [12, 90]]
3 [[71, 11], [9, 93]]
C:\Users\Sharon Karabel\anaconda3\lib\site-packages\xgboost\core.py:158: UserWarning:
[01:50:27] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
Interpretation:
Random Forest and Naive Bayes both achieved the highest accuracy (89.13%), with strong recall and F1 scores (above 0.90), making them highly effective even without addressing class imbalance.
Naive Bayes has the highest precision (0.8942) among all, showing good ability to avoid false positives, and its ROC AUC (0.9280) is competitive with Random Forest.
XGBoost performs reasonably well, but its recall (0.8824) is slightly lower than the top two, making it third-best in this setting.
Logistic Regression shows high recall (0.9314)—meaning it identifies most positive cases—but has the lowest precision and ROC AUC, indicating more false positives and relatively weaker probability calibration.
Conclusion:
Random Forest and Naive Bayes are the top contenders with balanced and high performance across all metrics. However, Random Forest slightly edges out in terms of overall AUC and F1, making it the preferred choice.
Logistic Regression, while interpretable, performs slightly worse overall, especially in terms of precision and AUC. Balancing the dataset (as you did with SMOTE later) helps refine these models further, particularly boosting Naive Bayes and reducing Logistic Regression's bias toward the majority class.
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Load dataset
df = pd.read_csv("heart.csv")
# Encode categorical columns
cat_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for col in cat_cols:
df[col] = LabelEncoder().fit_transform(df[col])
X = df.drop("HeartDisease", axis=1)
y = df["HeartDisease"]
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
# Define models
models = {
"Logistic Regression": LogisticRegression(max_iter=1000),
"Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
"XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
"Naive Bayes": GaussianNB()
}
# Evaluate models
results = {}
for name, model in models.items():
model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred
results[name] = {
"Accuracy": round(accuracy_score(y_test, y_pred), 4),
"Precision": round(precision_score(y_test, y_pred), 4),
"Recall": round(recall_score(y_test, y_pred), 4),
"F1 Score": round(f1_score(y_test, y_pred), 4),
"ROC AUC": round(roc_auc_score(y_test, y_proba), 4),
"Confusion Matrix": confusion_matrix(y_test, y_pred).tolist()
}
# Convert to table
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.reset_index(inplace=True)
results_df.rename(columns={'index': 'Model'}, inplace=True)
# Print the performance table
print(results_df)
C:\Users\Sharon Karabel\anaconda3\lib\site-packages\sklearn\base.py:474: FutureWarning: `BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.
Model Accuracy Precision Recall F1 Score ROC AUC \
0 Logistic Regression 0.8587 0.8654 0.8824 0.8738 0.9034
1 Random Forest 0.9076 0.9126 0.9216 0.9171 0.9380
2 XGBoost 0.8750 0.8911 0.8824 0.8867 0.9283
3 Naive Bayes 0.8967 0.8952 0.9216 0.9082 0.9316
Confusion Matrix
0 [[68, 14], [12, 90]]
1 [[73, 9], [8, 94]]
2 [[71, 11], [12, 90]]
3 [[71, 11], [8, 94]]
C:\Users\Sharon Karabel\anaconda3\lib\site-packages\xgboost\core.py:158: UserWarning:
[01:50:31] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\learner.cc:740:
Parameters: { "use_label_encoder" } are not used.
Interpretation:
Random Forest outperforms all other models with the highest accuracy (90.76%), F1 Score (91.71%), and ROC AUC (0.9380). It balances both precision and recall, making it the best-performing model overall after applying SMOTE.
Naive Bayes significantly improves after SMOTE. Its F1 score (90.82%) and ROC AUC (0.9316) are second only to Random Forest, indicating strong generalization on the balanced data.
XGBoost also performs well but ranks third. While it retains a high ROC AUC (0.9283), its F1 Score (88.67%) is slightly lower than the top two.
Logistic Regression remains the least performing model post-SMOTE, although it still maintains decent metrics (F1 Score: 87.38%, ROC AUC: 0.9034). It’s a good interpretable baseline, but lags behind in overall accuracy.
Conclusion:
After handling class imbalance using SMOTE, all models show noticeable improvement. However, Random Forest emerges as the top performer, offering the best trade-off between accuracy, precision, recall, and AUC. Naive Bayes also becomes a surprisingly competitive model. While XGBoost and Logistic Regression are still effective, they are slightly behind in precision and F1 performance. Thus, Random Forest is the recommended model for predicting heart disease in this context.
import plotly.graph_objects as go
fig = go.Figure(go.Indicator(
mode="gauge+number",
value=90.76,
number={'suffix': "%"},
title={'text': "Model Accuracy of Random Forest Model", 'font': {'size': 18}},
gauge={
'axis': {'range': [0, 100]},
'bar': {'color': "darkblue"},
'steps': [
{'range': [0, 60], 'color': 'red'},
{'range': [60, 80], 'color': 'yellow'},
{'range': [80, 100], 'color': 'lightgreen'}
],
'threshold': {
'line': {'color': "black", 'width': 4},
'thickness': 0.75,
'value': 90}
}
))
# Set smaller size
fig.update_layout(
width=400,
height=300,
margin=dict(l=20, r=20, t=50, b=20)
)
fig.show()
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# Load and preprocess data
df = pd.read_csv("heart.csv")
cat_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for col in cat_cols:
df[col] = LabelEncoder().fit_transform(df[col])
X = df.drop("HeartDisease", axis=1)
y = df["HeartDisease"]
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
# Train best model
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
# Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
# Plot ROC
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='red', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Random Forest')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()
Interpretation:
The model is well-calibrated and separates positive (HeartDisease=1) and negative (HeartDisease=0) classes effectively.
The Random Forest model achieves high diagnostic accuracy for heart disease, as shown by a high ROC AUC score. This makes it a reliable model for identifying at-risk individuals in dataset.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from lime.lime_tabular import LimeTabularExplainer
# Load and encode data
df = pd.read_csv("heart.csv")
df_encoded = df.copy()
# Encode categorical variables
for col in df_encoded.select_dtypes(include='object').columns:
df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])
X = df_encoded.drop("HeartDisease", axis=1)
y = df_encoded["HeartDisease"]
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Set up LIME explainer
explainer = LimeTabularExplainer(
training_data=X_train.values,
feature_names=X.columns.tolist(),
class_names=['No Disease', 'Disease'],
mode='classification'
)
# Explain one prediction
i = 5 # Index of test instance
exp = explainer.explain_instance(X_test.iloc[i].values, model.predict_proba, num_features=6)
# Show explanation in notebook
exp.show_in_notebook(show_table=True)
C:\Users\Sharon Karabel\anaconda3\lib\site-packages\sklearn\utils\validation.py:2739: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names
Interpretation:
Actual Predicted Probabilities:
No Disease: 34%
Disease: 66%
Features Supporting "Disease" (Orange Bars)
| Feature | Contribution | Explanation |
|---|---|---|
ST_Slope <= 1.0 |
+0.33 | A flat ST slope is strongly associated with disease. |
ChestPainType <= 0.0 |
+0.14 | Encoded type 0 may be typical angina — likely indicating heart issues. |
FastingBS > 0.0 |
+0.07 | High fasting blood sugar (i.e., diabetes risk) contributes to disease likelihood. |
Features Supporting "No Disease" (Blue Bars)
| Feature | Contribution | Explanation |
|---|---|---|
ExerciseAngina <= 0.0 |
-0.13 | No exercise-induced angina reduces disease likelihood. |
0.00 < Oldpeak <= 0.6 |
-0.09 | Low ST depression suggests less heart strain — healthy sign. |
MaxHR > 155.0 |
-0.06 | High maximum heart rate usually reflects good fitness levels. |
Feature Values (Bottom Table)
| Feature | Value | Meaning |
|---|---|---|
ST_Slope |
1.00 | Flat slope (possibly unhealthy) |
ChestPainType |
0.00 | Encoded pain type (e.g., typical angina) |
ExerciseAngina |
0.00 | No angina during exercise |
Oldpeak |
0.20 | Very low ST depression — good |
FastingBS |
1.00 | Fasting blood sugar is high (risk) |
MaxHR |
163.00 | Very high HR — typically a healthy sign |
Observations:
The model predicted "Disease" with 66% confidence, mainly because:
The patient has flat ST slope and typical chest pain.
Fasting blood sugar is high, a risk factor.
However, the model also noted healthy indicators:
No angina during exercise.
High max heart rate and low ST depression.
The LIME explanation shows that the model predicts Heart Disease (66% probability) due to key risk indicators such as a flat ST slope, typical chest pain, and high fasting blood sugar. Although some features like no exercise-induced angina and high maximum heart rate suggest good health, the risk factors outweigh the healthy signs, leading to the prediction.
from IPython.display import display, HTML
display(HTML('''
<a href="https://heart-disease-risk-prediction-using-patient-records-aqxmiml4qu.streamlit.app/" target="_blank">
<button style="
background-color: #F08080;
border: none;
color: black;
padding: 12px 24px;
text-align: center;
text-decoration: none;
display: inline-block;
font-size: 16px;
border-radius: 6px;
cursor: pointer;">
❤️ Open Heart Disease Prediction App
</button>
</a>
'''))
IMPORTANT NOTE:
I have created an interactive AI doctor app where we enter the patient details and the AI bot gives us advices on medications, exercise, food and tests need to be taken. And we can download the prescription in PDF format.
To access the app please click on the button : Open Heart Disease Prediction App above. This will take you to another tab and you can access it.